1 Differences in indicators between academic status groups

1.1 Data preparation

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at /Users/Jo/OneDrive/1_Hertie Studies/Thesis/Hertie-Thesis-Mehler
library(ggplot2)
library(descr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggpubr)
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter
theme_set(theme_minimal())

data <- read_csv(here("data", "data_combined.csv"), 
                 col_types = cols(
                   academic_status = col_factor(levels = c("non-academic", "academic")),
                   educ_cat = col_factor(levels = c("Low", "Intermediate", "High")), 
                   gender = col_factor(levels = c("Male", "Female", "Other")), 
                   monority_cat = col_factor(levels = c("Non-minority", "Minority")),
                   age_cat = col_factor(levels = c("18-29", "30-49", "50-69", "70+")),
                   polinterest_cat_3 = col_factor(levels = c("Low", "Intermediate", "High")),
                   empathy_pc_cat = col_factor(levels = c("Less empathetic", "More empathetic")),
                   exp_hate_speech_cat = col_factor(levels = c("Less experience", "More experience")), 
                   exp_hostile_engagement_cat = col_factor(levels = c("Less experience", "More experience")), 
                   cluster = col_factor(levels = c("Denying/Nonsense", "Intent-based", "Harms-based", "Harms-based narrow")),
                   leftright_cat = col_factor(levels = c("liberal/left", "moderate/center", "conservative/right")),
                   commitment_cat = col_factor(levels = c("Below average", "Above Average"))
                 ))
## Warning: The following named parsers don't match the column names: monority_cat
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# filter out NA's in academic status and extreme outliers
data <- data %>% filter(!is.na(educ_cat), !is.na(academic_status), text_length < 1000, readability_score < 40)

indicators <- c("text_length_log2", "readability_score", "leftright_pred_error", "cluster")

1.2 Indicators by Academic Status

p <- ggpairs(data,                 # Data frame
        columns = indicators, # Columns
        aes(color = academic_status,  # Color by group (cat. variable)
            alpha = 0.5),
        upper = list(continuous = wrap("cor", size = 2.7)
        ))

p
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_bin()`).

1.2.1 Example how to extract single plots

getPlot(p, 1, 4)

1.2.2 Distribution of Clusters

getPlot(p, 4, 4)

num_academics <- data %>% filter(academic_status == "academic") %>% nrow()
num_non_academics <- data %>% filter(academic_status == "non-academic") %>% nrow()

data_cluster <- data %>% 
  select(academic_status, cluster) %>% 
  group_by(academic_status, cluster) %>% 
  count() %>% 
  mutate(share = ifelse(academic_status == "academic", n/num_academics, n/num_non_academics)) %>% 
#  arrange(cluster) %>%
  ungroup() %>% 
  # Create an ordering variable based on descending shares within each academic_status
  arrange(academic_status, desc(share)) %>% 
  mutate(order = rank(-share)) %>%
  mutate(cluster = factor(cluster, levels = unique(cluster)))

# Bar plot with error bars with and legend titles
data_cluster %>%
  ggplot(aes(x = reorder(cluster, order), y = share, fill = academic_status)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.75)) +
  # Custom colors for each academic_status
  scale_fill_manual(values = c("academic" = "skyblue", "non-academic" = "darkgrey")) +
  labs(title = "Cluster Distribution by Group", 
       x = "Cluster", 
       y = "Frequency",
       fill = "Academic Status") + # Legend Title for fill 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggsave("3-2-cluster_distribution.png", path = here("figures"), height = 4, width = 4)

1.3 Textlength and Academic Status

1.3.1 Mean, SD, T-Test

# logged textlength
data %>%
  group_by(academic_status) %>%
  get_summary_stats(text_length_log2, type = "mean_sd")
## # A tibble: 2 × 5
##   academic_status variable             n  mean    sd
##   <fct>           <fct>            <dbl> <dbl> <dbl>
## 1 non-academic    text_length_log2   458  6.54  1.30
## 2 academic        text_length_log2   605  6.54  1.30
# Base R version
res <- t.test(text_length ~ academic_status, data = data)
res
## 
##  Welch Two Sample t-test
## 
## data:  text_length by academic_status
## t = 0.26877, df = 967.04, p-value = 0.7882
## alternative hypothesis: true difference in means between group non-academic and group academic is not equal to 0
## 95 percent confidence interval:
##  -12.43859  16.38641
## sample estimates:
## mean in group non-academic     mean in group academic 
##                   132.6681                   130.6942
# statix version
stat.test <- data %>% 
  t_test(text_length ~ academic_status, detailed = TRUE) %>%
  add_significance()
stat.test
## # A tibble: 1 × 16
##   estimate estimate1 estimate2 .y.     group1 group2    n1    n2 statistic     p
##      <dbl>     <dbl>     <dbl> <chr>   <chr>  <chr>  <int> <int>     <dbl> <dbl>
## 1     1.97      133.      131. text_l… non-a… acade…   458   605     0.269 0.788
## # ℹ 6 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>, method <chr>,
## #   alternative <chr>, p.signif <chr>
# Create a box-plot
bxp <- ggboxplot(
  data, x = "academic_status", y = "text_length", 
  ylab = "Text Length", xlab = "Groups", add = "jitter"
  )

# Add p-value and significance levels
stat.test <- stat.test %>% add_xy_position(x = "academic_status")
bxp + 
  stat_pvalue_manual(stat.test, tip.length = 0) +
  labs(subtitle = get_test_label(stat.test, detailed = TRUE))

1.4 Readability and Academic Status

1.4.1 T-Test

# Base R version
res <- t.test(readability_score ~ academic_status, data = data)
res
## 
##  Welch Two Sample t-test
## 
## data:  readability_score by academic_status
## t = -2.1874, df = 990.53, p-value = 0.02895
## alternative hypothesis: true difference in means between group non-academic and group academic is not equal to 0
## 95 percent confidence interval:
##  -1.39185278 -0.07548526
## sample estimates:
## mean in group non-academic     mean in group academic 
##                   11.14713                   11.88080
# t-test
stat.test <- data %>% 
  t_test(readability_score ~ academic_status, detailed = TRUE) %>%
  add_significance()
stat.test
## # A tibble: 1 × 16
##   estimate estimate1 estimate2 .y.    group1 group2    n1    n2 statistic      p
##      <dbl>     <dbl>     <dbl> <chr>  <chr>  <chr>  <int> <int>     <dbl>  <dbl>
## 1   -0.734      11.1      11.9 reada… non-a… acade…   458   605     -2.19 0.0289
## # ℹ 6 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>, method <chr>,
## #   alternative <chr>, p.signif <chr>
# Create a box-plot
bxp <- ggboxplot(
  data, x = "academic_status", y = "readability_score", 
  ylab = "readability_score", xlab = "Groups", add = "jitter"
  )

# Add p-value and significance levels
stat.test <- stat.test %>% add_xy_position(x = "academic_status")
bxp + 
  stat_pvalue_manual(stat.test, tip.length = 0) +
  labs(subtitle = get_test_label(stat.test, detailed = TRUE))

1.5 Cluster Group and Academic Status: Chi-Square Test

Chi-square test of independence: interested in testing whether there’s a significant association between the two categorical variables (group and cluster)

# Create the contingency table
table <- table(data$academic_status, data$cluster)
table
##               
##                Denying/Nonsense Intent-based Harms-based Harms-based narrow
##   non-academic               87          114         182                 75
##   academic                   95          194         192                124
# Conduct Chi-square test of independence
chi_test_result <- chisq.test(table)

# Print the results
print(chi_test_result)
## 
##  Pearson's Chi-squared test
## 
## data:  table
## X-squared = 13.391, df = 3, p-value = 0.003862

The p-value is 0.06, which is slightly above my alpha level of 0.05. This means my result is not significant (Aktueller Stand: 28. März mit ca. 1000 observations).

# Simple bar plot
barplot(table, beside=TRUE, legend=TRUE)

1.5.1 Distribution and Shares (just as extra)

crosstab(data$cluster, data$academic_status, prop.r = TRUE)
##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |             Row Percent | 
## |-------------------------|
## 
## =====================================================
##                       data$academic_status
## data$cluster          non-academic   academic   Total
## -----------------------------------------------------
## Denying/Nonsense               87         95     182 
##                              47.8%      52.2%   17.1%
## -----------------------------------------------------
## Intent-based                  114        194     308 
##                              37.0%      63.0%   29.0%
## -----------------------------------------------------
## Harms-based                   182        192     374 
##                              48.7%      51.3%   35.2%
## -----------------------------------------------------
## Harms-based narrow             75        124     199 
##                              37.7%      62.3%   18.7%
## -----------------------------------------------------
## Total                         458        605    1063 
## =====================================================
crosstab(data$cluster, data$academic_status, prop.c = TRUE)

##    Cell Contents 
## |-------------------------|
## |                   Count | 
## |          Column Percent | 
## |-------------------------|
## 
## =====================================================
##                       data$academic_status
## data$cluster          non-academic   academic   Total
## -----------------------------------------------------
## Denying/Nonsense               87         95     182 
##                              19.0%      15.7%        
## -----------------------------------------------------
## Intent-based                  114        194     308 
##                              24.9%      32.1%        
## -----------------------------------------------------
## Harms-based                   182        192     374 
##                              39.7%      31.7%        
## -----------------------------------------------------
## Harms-based narrow             75        124     199 
##                              16.4%      20.5%        
## -----------------------------------------------------
## Total                         458        605    1063 
##                              43.1%      56.9%        
## =====================================================
ggplot(data, aes(x = cluster, fill = academic_status)) +
  geom_bar(position = "dodge") +
  labs(title = "Number of Observations per Cluster, Grouped by Academic Status",
       x = "Cluster",
       y = "Number of Observations") +
  theme_minimal()

1.6 Prediction of Political Orientation and Academic Status

1.6.1 T-Test

# Base R version
res <- t.test(leftright_pred_error ~ academic_status, data = data)
res
## 
##  Welch Two Sample t-test
## 
## data:  leftright_pred_error by academic_status
## t = 0.1253, df = 781.05, p-value = 0.9003
## alternative hypothesis: true difference in means between group non-academic and group academic is not equal to 0
## 95 percent confidence interval:
##  -0.2201679  0.2501912
## sample estimates:
## mean in group non-academic     mean in group academic 
##                   2.150036                   2.135025
# t-test
stat.test <- data %>% 
  t_test(leftright_pred_error ~ academic_status, detailed = TRUE) %>%
  add_significance()
stat.test
## # A tibble: 1 × 16
##   estimate estimate1 estimate2 .y.     group1 group2    n1    n2 statistic     p
##      <dbl>     <dbl>     <dbl> <chr>   <chr>  <chr>  <int> <int>     <dbl> <dbl>
## 1   0.0150      2.15      2.14 leftri… non-a… acade…   362   478     0.125   0.9
## # ℹ 6 more variables: df <dbl>, conf.low <dbl>, conf.high <dbl>, method <chr>,
## #   alternative <chr>, p.signif <chr>
# Create a box-plot
bxp <- ggboxplot(
  data, x = "academic_status", y = "leftright_pred_error", 
  ylab = "leftright pred score", xlab = "Groups", add = "jitter"
  )

# Add p-value and significance levels
stat.test <- stat.test %>% add_xy_position(x = "academic_status")
bxp + 
  stat_pvalue_manual(stat.test, tip.length = 0) +
  labs(subtitle = get_test_label(stat.test, detailed = TRUE))
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).

2 Differences in indicators between three edcucation levels

2.1 Indicators by Education Category

p <- ggpairs(data,                 # Data frame
        columns = indicators, # Columns
        aes(color = educ_cat,  # Color by group (cat. variable)
            alpha = 0.5),
        upper = list(continuous = wrap("cor", size = 2.7)
        ))

p
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 223 rows containing missing values
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_bin()`).

2.1.1 Example how to extract single plots

getPlot(p, 1, 4)

2.2 Textlength and Education Cat

# Create the box plot
p <- ggplot(data, aes(x = educ_cat, y = text_length_log2)) +
  geom_point(position = position_jitter(width = 0.2), alpha = 0.5, color = "skyblue") +
  geom_boxplot(outliers = FALSE) +
  theme_minimal() +
  labs(title = "Comparison of Text Length by Education",
       x = "Academic Status",
       y = "Text Length")

# Print the plot
print(p)

2.3 Readability and educ_cat

# Create the box plot
p <- ggplot(data, aes(x = educ_cat, y = readability_score)) +
  geom_point(position = position_jitter(width = 0.2), alpha = 0.5, color = "skyblue") +
  geom_boxplot(outliers = FALSE) +
  theme_minimal() +
  labs(title = "Comparison of Readability by Education",
       x = "Academic Status",
       y = "Readability Score")

# Print the plot
print(p)

2.4 Prediction of Political Orientation and educ_cat

# Create the box plot
p <- ggplot(data, aes(x = educ_cat, y = leftright_pred_error)) +
  geom_point(position = position_jitter(width = 0.2), alpha = 0.5, color = "skyblue") +
  geom_boxplot(outliers = FALSE) +
  theme_minimal() +
  labs(title = "Comparison of Prediction of Political Orientation by educ_cat",
       x = "Education",
       y = "Readability Score")

# Print the plot
print(p)
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 223 rows containing missing values or values outside the scale range
## (`geom_point()`).

3 Density Curves

3.1 Density Curves for Academic Status

# Define the function
plot_density_comparison <- function(data, numeric_var_name, group_var_name = "academic_status") {
  # Ensure the variable names are non-empty and exist in the dataframe
  if (!numeric_var_name %in% names(data)) {
    stop("Numeric variable name does not exist in the dataframe.")
  }
  
  if (!group_var_name %in% names(data)) {
    stop("Group variable name does not exist in the dataframe.")
  }
  
  # Use ggplot to create the density plot
  p <- ggplot(data, aes_string(x = numeric_var_name, fill = group_var_name)) +
    geom_density(alpha = 0.5) + # Adjust alpha for transparency
    labs(x = numeric_var_name, y = "Density") + # Labels
    ggtitle(paste("Density of", numeric_var_name, "by", group_var_name)) +
    scale_fill_manual(values = c("academic" = "skyblue", "non-academic" = "pink")) + # Customize colors
    theme_minimal() # Use a minimal theme for a nicer plot
  
  return(p)
}

# Example usage with your dataframe 'data' and the variable 'readability_score'
plot1 <- plot_density_comparison(data, "text_length_log2")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# For readability_score
plot2 <- plot_density_comparison(data, "readability_score")

# For leftright_pred_error
plot3 <- plot_density_comparison(data, "leftright_pred_error")

# add type/cluster indicator later

plot1

plot2

plot3
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).

3.2 Density Curves for Education Category

# Define the function
plot_density_comparison <- function(data, numeric_var_name, group_var_name = "educ_cat") {
  # Ensure the variable names are non-empty and exist in the dataframe
  if (!numeric_var_name %in% names(data)) {
    stop("Numeric variable name does not exist in the dataframe.")
  }
  
  if (!group_var_name %in% names(data)) {
    stop("Group variable name does not exist in the dataframe.")
  }
  
  # Use ggplot to create the density plot
  p <- ggplot(data, aes_string(x = numeric_var_name, fill = group_var_name)) +
    geom_density(alpha = 0.5) + # Adjust alpha for transparency
    labs(x = numeric_var_name, y = "Density") + # Labels
    ggtitle(paste("Density of", numeric_var_name, "by", group_var_name)) +
    scale_fill_manual(values = c("High" = "skyblue", "Intermediate" = "lightgreen", "Low" = "pink")) + # Customize colors
    theme_minimal() # Use a minimal theme for a nicer plot
  
  return(p)
}

# Example usage with your dataframe 'data' and the variable 'text_length_log2'
plot5 <- plot_density_comparison(data, "text_length_log2")

# For readability_score
plot6 <- plot_density_comparison(data, "readability_score")

# For leftright_pred_error
plot7 <- plot_density_comparison(data, "leftright_pred_error")

# add type/cluster indicator later

plot5

plot6

plot7
## Warning: Removed 223 rows containing non-finite outside the scale range
## (`stat_density()`).

3.2.1 Facet Wrap of the Plots

DID NOT WORK YET!!

#library(patchwork)
#
## Assuming plot1, plot2, plot3, plot5, plot6, plot7 are your ggplot objects
#plot_layout <- plot1 + plot2 + plot3 + plot5 + plot6 + plot7 +
#  plot_layout(guides = "collect") &
#  theme(legend.position = "bottom")
#
#plot_layout <- plot1 + plot2 + plot3 + plot5 + plot6 + plot7 +
#  plot_layout(ncol = 2) # This is the correct usage to specify layout options
#
#
#plot_layout